How Embryos Acclimate to Temperature through Epigenetic Regulation

Author

Thomas O’Leary, Emily Mikucki, Sumaetee Tangwancharoen, Sara Helms Cahan, Seth Frietze, Brent L. Lockwood

Code

cs_pheno.R
Code
# ------------------------------------------------------------------------------
# Canton S acclimation phenotype figure
# March 31, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Description -----
# Canton S phenotype figure

# Load libraries
library(tidyverse)

# Load data
dat <- read_csv(here::here("data/raw/pheno/embryo_acc_pheno.csv"))

# Filter only Canton S data
dat <- dat |>
  filter(Genotype == "CantonS") |>
  filter(Stage == "Early")

# Count the number of eggs per acclimation treatment
dat |>
  group_by(Acclimation) |>
  summarise(total_eggs = sum(Number_Eggs))

# Acclimation effect
dat |>
  glm(Survival ~ Acclimation, 
      data = .,
      weights = Number_Eggs,
      family = binomial(link = "logit"))
00_mkref.sh
Code
#!/bin/bash

# Make a reference package for D. melanogaster ---------------------------------

# Directions for running this script -------------------------------------------
# 0. Be sure the following things have happened before beginning
#   - Download FASTA and GTF files for D. melanogaster from Ensembl
#   - Create a contig file according to 10X Genomics instructions
# 1. In the command line, run the following command: bash path/to/this/file.sh

# Set up directories and variables ---------------------------------------------

# Move to the directory where you want the output files to be saved
cd /netfiles02/lockwood_lab/heater/data/processed/seq/ref/

# Point towards installed cellranger-arc program
export PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH

# # Set variables for the script
# ref_path="/netfiles02/lockwood_lab/heater/data/processed/seq/ref/"
# libraries_path="/netfiles02/lockwood_lab/heater/data/raw/seq/libraries/"

# Create a raw unfiltered reference package ------------------------------------
cellranger-arc mkref --config=BDGP6.32.config

# Filter the GTF to restrict it to protein-coding, lncRNA, antisense, and 
#   immune-related genes as 10X Genomics has done for their human and mouse refs
cellranger-arc mkgtf Drosophila_melanogaster.BDGP6.32.109.gtf \
  Drosophila_melanogaster.BDGP6.32.109_filtered.gtf \
  --attribute=gene_biotype:protein_coding \
  --attribute=gene_biotype:lncRNA \
  --attribute=gene_biotype:antisense \
  --attribute=gene_biotype:IG_LV_gene \
  --attribute=gene_biotype:IG_V_gene \
  --attribute=gene_biotype:IG_V_pseudogene \
  --attribute=gene_biotype:IG_D_gene \
  --attribute=gene_biotype:IG_J_gene \
  --attribute=gene_biotype:IG_J_pseudogene \
  --attribute=gene_biotype:IG_C_gene \
  --attribute=gene_biotype:IG_C_pseudogene \
  --attribute=gene_biotype:TR_V_gene \
  --attribute=gene_biotype:TR_V_pseudogene \
  --attribute=gene_biotype:TR_D_gene \
  --attribute=gene_biotype:TR_J_gene \
  --attribute=gene_biotype:TR_J_pseudogene \
  --attribute=gene_biotype:TR_C_gene

# Create a filtered reference package ------------------------------------------
cellranger-arc mkref --config=BDGP6.32.filtered.config 
01_count.sh
Code
#!/bin/bash

# Directions for running this script -------------------------------------------
# 0. Be sure the following things have happened before beginning
#   - Demultiplex all samples
#   - Create reference package
#   - Create a separate libraries csv file pointing to demuxed fastq files
#   - Set up slurm.template file according to 10X Genomics instructions in 
#      cellranger-arc-2.0.2/external/martian/jobmanagers
# 1. In the command line, run the following command: sbatch path/to/this/file.sh

# Set up directories and variables ---------------------------------------------

# Move to the directory where you want the output files to be saved
cd /netfiles02/lockwood_lab/heater/data/processed/seq/samples/

# Point towards installed cellranger-arc program
export PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH

# Set variables for the script
ref_path="/netfiles02/lockwood_lab/heater/data/raw/seq/ref/BDGP6_32_filtered"
libraries_path="/netfiles02/lockwood_lab/heater/data/raw/seq/libraries/"
n_cores=32
mem=64

# Run cellranger-arc count for each sample -------------------------------------

# Sample 18°C Rep 1
cellranger-arc count --id=18C_Rep1 \
  --reference=${ref_path} \
  --libraries=${libraries_path}/18C_Rep1.csv \
  --localcores=${n_cores} \
  --localmem=${mem} \
  --jobmode=slurm
  
# Sample 18°C Rep 2
cellranger-arc count --id=18C_Rep2 \
  --reference=${ref_path} \
  --libraries=${libraries_path}/18C_Rep2.csv \
  --localcores=${n_cores} \
  --localmem=${mem} \
  --jobmode=slurm
 
# Sample 25°C Rep 1
cellranger-arc count --id=25C_Rep1 \
  --reference=${ref_path} \
  --libraries=${libraries_path}/25C_Rep1.csv \
  --localcores=${n_cores} \
  --localmem=${mem} \
  --jobmode=slurm
  
# Sample 25°C Rep 2
cellranger-arc count --id=25C_Rep2 \
  --reference=${ref_path} \
  --libraries=${libraries_path}/25C_Rep2.csv \
  --localcores=${n_cores} \
  --localmem=${mem} \
  --jobmode=slurm
02_aggr.sh
Code
#!/bin/bash

# Aggregate together all samples -----------------------------------------------

# Directions for running this script -------------------------------------------
# 0. Before running run cellranger-arc count for all samples and create 
#      aggr_libraries.csv file pointing to atac_fragments.tsv.gz and 
#      gex_molecule_info.h5 files per 10X Genomics instructions.
# 1. In the command line, run the following command: sbatch path/to/this/file.sh

# Request cluster resources ----------------------------------------------------

# Specify partition
#SBATCH --partition=bluemoon

# Request nodes
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12

# Request memory for the entire job -- you can request --mem OR --mem-per-cpu
#SBATCH --mem=31000

# Reserve walltime -- hh:mm:ss -- 30 hrs max below
#SBATCH --time=30:00:00

# Name this job
#SBATCH --job-name=aggr_all

# Name output of this job using %x=job-name and %j=job-id
#SBATCH --output=logs/%x_%j.out 

# Echo some useful and interesting information 
echo "Starting sbatch script at: `date`"
echo "  running host:    ${SLURMD_NODENAME}"
echo "  assigned nodes:  ${SLURM_JOB_NODELIST}"
echo "  partition used:  ${SLURM_JOB_PARTITION}"
echo "  jobid:           ${SLURM_JOBID}"

# Set up directories and variables ---------------------------------------------

# Move to the directory where files will be aggregated
cd /netfiles02/lockwood_lab/heater/data/processed/seq/

# Point towards installed cellranger-arc program
export PATH=/netfiles02/lockwood_lab/cellranger-arc/cellranger-arc-2.0.2:$PATH

# Set variables for the script
ref_path="/netfiles02/lockwood_lab/heater/data/raw/seq/ref/BDGP6_32_filtered"
libraries_path="/netfiles02/lockwood_lab/heater/data/processed/seq/samples/aggr_libraries.csv"

# Run cellranger-arc aggr to aggregate all samples together --------------------
cellranger-arc aggr --id=all \
  --csv=${libraries_path} \
  --reference=${ref_path} \
  --normalize=depth
00_create_seurat_object.R
Code
# ------------------------------------------------------------------------------
# Load data and set up Seurat object
# May 11, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Description 
# Load in the RNA-Seq count data and the ATAC-Seq peak fragment data and save as
# an rds file to be used for quality control and filtering.

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)
library(AnnotationHub)

# Load the RNA-Seq and ATAC-Seq data -------------------------------------------
data_dir <- here::here("data/processed/seq/all/outs/")
raw_feature_dir <- paste0(data_dir, "raw_feature_bc_matrix")
frag_path <- paste0(data_dir, "atac_fragments.tsv.gz")
counts <- Read10X(raw_feature_dir)

# Get gene annotations for dm6 -------------------------------------------------
BDGP6.32 <- query(AnnotationHub(), 
                  c("EnsDb", "Drosophila melanogaster", "109"))[[1]]
annotation <- GetGRangesFromEnsDb(BDGP6.32)

# Meta data --------------------------------------------------------------------
# Create meta data for all cells based on the GEM well suffix. From 10X Genomics
# "This number, which indicates which GEM well the barcode sequence came from, 
# is called the GEM well suffix. The numbering of the GEM wells will reflect the 
# order that the GEM wells were provided in the Aggregation CSV."
meta_data <- tibble(cellnames = colnames(counts[[1]])) |> 
  mutate(orig.ident = str_extract(cellnames, "[1-4]")) |> 
  full_join(tibble::tribble(
    ~orig.ident, ~sample_name, ~acc_temp,
    "1", "18C_Rep1", "18°C",
    "2", "18C_Rep2", "18°C",
    "3", "25C_Rep1", "25°C",
    "4", "25C_Rep2", "25°C"
  ), 
  by = "orig.ident") |> 
  column_to_rownames("cellnames")

# Create a Seurat object containing the RNA data
dat <- CreateSeuratObject(
  project = "heater",
  counts = counts$`Gene Expression`,
  assay = "RNA",
  names.delim = "-",
  meta.data = meta_data
)

# Create ATAC assay and add it to the object
dat[["ATAC"]] <- CreateChromatinAssay(
  counts = counts$Peaks,
  sep = c(":", "-"),
  fragments = frag_path,
  annotation = annotation,
  validate.fragments = FALSE
)

# Save raw Seurat object
saveRDS(dat, here::here("data/processed/seurat_object/00_dat_raw.rds"))

# Create Seurat object from the filtered matrix from 10X Cell Ranger ARC -------

# Load filtered data
filtered_feature_dir <- paste0(data_dir, "filtered_feature_bc_matrix")
counts <- Read10X(filtered_feature_dir)

# Meta data for counts from filtered counts
meta_data <- tibble(cellnames = colnames(counts[[1]])) |>
  mutate(orig.ident = str_extract(cellnames, "[1-4]")) |>
  full_join(tibble::tribble(
    ~orig.ident, ~sample_name, ~acc_temp,
    "1", "18C_Rep1", "18°C",
    "2", "18C_Rep2", "18°C",
    "3", "25C_Rep1", "25°C",
    "4", "25C_Rep2", "25°C"
  ), 
  by = "orig.ident") |>
  column_to_rownames("cellnames")

# Create a Seurat object containing the RNA data
dat <- CreateSeuratObject(
  project = "heater",
  counts = counts$`Gene Expression`,
  assay = "RNA",
  names.delim = "-",
  meta.data = meta_data
)

# Create ATAC assay and add it to the object
dat[["ATAC"]] <- CreateChromatinAssay(
  counts = counts$Peaks,
  sep = c(":", "-"),
  fragments = frag_path,
  annotation = annotation
)

# Save 10X Cell Ranger ARC filtered Seurat object
saveRDS(dat, here::here("data/processed/seurat_object/00_dat_10x_cells.rds"))
01_quality_control_filtering.R
Code
# ------------------------------------------------------------------------------
# Quality control and filtering of low-quality cells
# March 27, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Description -----
# Basic quality control metrics and filtering out non-cell barcodes from the 10X 
# Cell Ranger ARC called cells.

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)

# Load 10X filtered data
dat <- readRDS(
  here::here("data/processed/seurat_object/00_dat_10x_cells.rds")
)

# Calculate Nucleosome Signal and TSS Enrichment for ATAC QC metrics
DefaultAssay(dat) <- "ATAC"
dat <- NucleosomeSignal(dat)
dat <- TSSEnrichment(dat)

# Calculate the percentage of RNA reads that map to mitochondrial genes 
DefaultAssay(dat) <- "RNA"
dat[["percent.mt"]] <- PercentageFeatureSet(
  dat,
  pattern = "^mt:"
)
# Calculate the percentage of RNA reads that map to ribosomal genes
dat[["percent.ribo"]] <- PercentageFeatureSet(
  data, 
  pattern = "^Rp[S|L]"
)

# Save object with Nucleosome, TSS, and mitochondria meta.data added 
saveRDS(dat, "data/processed/seurat_object/01_dat_10x_cells.rds")

# Filter out barcodes with low counts or high mitochondrial content ------------

# Define filtering thresholds
low_ATAC <- 800
low_RNA <- 200
max_mt <- 5 #### Calderon did 25 % for both mt and rb -- ask Seth? Maybe
#max_rb <- 5

# Subset 10x data based on thresholds
dat <- dat |>
  subset(
    nCount_RNA >= low_RNA &
      nCount_ATAC >= low_ATAC & 
      percent.mt < max_mt # &
      # percent.ribo < max_rb
    )

# Save object
saveRDS(dat, "data/processed/seurat_object/01_dat_10x_filtered.rds")

# Create a filtered fragment file to be used in the amulet doublet finder ------

# Path to fragment file with all barcodes included
data_dir <- here::here("data/processed/seq/all/outs/")
frag_path <- paste0(data_dir, "atac_fragments.tsv.gz")

# Write the file with fragments from only the filtered barcodes
FilterCells(frag_path,
            Cells(dat),
            outfile = paste0(data_dir, "atac_fragments_cells.tsv.gz"))
02_initial_cluster.R
Code
# ------------------------------------------------------------------------------
# Dimension reduction and clustering
# May 11, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)
library(clustree)

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/01_dat_10x_filtered.rds")
)

# Run the standard workflow for visualization and clustering -------------------
# This initial clustering is just for the multiplets plots

# Set RNA to the default assay set for the following set of operations 
DefaultAssay(dat) <- "RNA"

# Log-Normalize data
dat <- NormalizeData(dat)

# Find variable features
dat <- FindVariableFeatures(dat)

# Scale and center data
dat <- ScaleData(dat)

# Run principle component analysis 
dat <- RunPCA(dat)

# Run UMAP
dat <- RunUMAP(dat, dims = 1:30)

# Construct nearest-neighbor graph
dat <- FindNeighbors(dat, dims = 1:30)

# Finding the correct resolution of cluster -----
# Run on a bunch of different resolutions
dat <- FindClusters(dat, resolution = seq(0.01, 0.1, 0.01))
# Look at the cluster tree
clustree::clustree(dat)

# Remove the RNA_snn columns added
dat@meta.data <- dat@meta.data |>
  select(!contains("RNA_snn"))

# Set the final clusters
dat <- FindClusters(dat, resolution = 0.03)

# Save data
saveRDS(
  dat, 
  here::here("data/processed/seurat_object/02_dat_initial_cluster.rds")
)
04_doublet_finder.R
Code
# ------------------------------------------------------------------------------
# Detecting multiplets in the multiome data sets
# DoubleFinder and amulet
# May 22, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)
library(Seurat)
library(DoubletFinder)

# DoubletFinder for RNA doublets -----------------------------------------------

# Define expected doublet rate from 10X estimates
doublet_rate <- 0.008/1000
exp_double_rate <- doublet_rate*4000

# Each sample must be done separately per the instructions of DoubletFinder

# 18C_Rep1 ---------------------------------------------------------------------

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/02_dat_10x_filtered.rds")) |>
  subset(sample_name == "18C_Rep1")

# Pre-process Seurat object ----------------------------------------------------
DefaultAssay(dat) <- "RNA"
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 2000)
dat <- ScaleData(dat)
dat <- RunPCA(dat)
dat <- RunUMAP(dat, dims = 1:10)
dat <- FindNeighbors(dat, dims = 1:10)
dat <- FindClusters(dat, resolution = 0.1)

# pK Identification ------------------------------------------------------------
sweep.res.list_dat <- paramSweep_v3(dat, PCs = 1:10, sct = FALSE)
sweep.stats_dat <- summarizeSweep(sweep.res.list_dat, GT = FALSE)
bcmvn_dat <- find.pK(sweep.stats_dat)

# Homotypic Doublet Proportion Estimate ----------------------------------------
homotypic.prop <- modelHomotypic(dat@meta.data$seurat_clusters)
nExp_poi <- round(exp_double_rate * nrow(dat@meta.data))
nExp_poi.adj <- round(nExp_poi*(1 - homotypic.prop))

# Run DoubletFinder with varying classification stringencies -------------------
dat <- doubletFinder_v3(dat, 
                        PCs = 1:10,
                        pK = 0.23,
                        nExp = nExp_poi.adj,
                        reuse.pANN = FALSE, 
                        sct = FALSE)

# Save the classified doublets
doublets_18C_Rep1 <- dat@meta.data |>
  subset(DF.classifications_0.25_0.23_61 == "Doublet") |>
  rownames()

# 18C_Rep2 ---------------------------------------------------------------------

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/02_dat_10x_filtered.rds")) |>
  subset(sample_name == "18C_Rep2")

# Pre-process Seurat object ----------------------------------------------------
DefaultAssay(dat) <- "RNA"
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 2000)
dat <- ScaleData(dat)
dat <- RunPCA(dat)
dat <- RunUMAP(dat, dims = 1:10)
dat <- FindNeighbors(dat, dims = 1:10)
dat <- FindClusters(dat, resolution = 0.1)

# pK Identification ------------------------------------------------------------
sweep.res.list_dat <- paramSweep_v3(dat, PCs = 1:10, sct = FALSE)
sweep.stats_dat <- summarizeSweep(sweep.res.list_dat, GT = FALSE)
bcmvn_dat <- find.pK(sweep.stats_dat)

# Homotypic Doublet Proportion Estimate ----------------------------------------
homotypic.prop <- modelHomotypic(dat@meta.data$seurat_clusters)
nExp_poi <- round(exp_double_rate * nrow(dat@meta.data))
nExp_poi.adj <- round(nExp_poi*(1 - homotypic.prop))

# Run DoubletFinder with varying classification stringencies -------------------
dat <- doubletFinder_v3(dat, 
                        PCs = 1:10,
                        pK = 0.005,
                        nExp = nExp_poi.adj,
                        reuse.pANN = FALSE, 
                        sct = FALSE)

# Save the classified doublets
doublets_18C_Rep2 <- dat@meta.data |>
  subset(DF.classifications_0.25_0.005_103 == "Doublet") |>
  rownames()


# 25C_Rep1 ---------------------------------------------------------------------

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/02_dat_10x_filtered.rds")) |>
  subset(sample_name == "25C_Rep1")

# Pre-process Seurat object ----------------------------------------------------
DefaultAssay(dat) <- "RNA"
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 2000)
dat <- ScaleData(dat)
dat <- RunPCA(dat)
dat <- RunUMAP(dat, dims = 1:10)
dat <- FindNeighbors(dat, dims = 1:10)
dat <- FindClusters(dat, resolution = 0.1)

# pK Identification ------------------------------------------------------------
sweep.res.list_dat <- paramSweep_v3(dat, PCs = 1:10, sct = FALSE)
sweep.stats_dat <- summarizeSweep(sweep.res.list_dat, GT = FALSE)
bcmvn_dat <- find.pK(sweep.stats_dat)

# Homotypic Doublet Proportion Estimate ----------------------------------------
homotypic.prop <- modelHomotypic(dat@meta.data$seurat_clusters)
nExp_poi <- round(exp_double_rate * nrow(dat@meta.data))
nExp_poi.adj <- round(nExp_poi*(1 - homotypic.prop))

# Run DoubletFinder with varying classification stringencies -------------------
dat <- doubletFinder_v3(dat, 
                        PCs = 1:10,
                        pK = 0.3,
                        nExp = nExp_poi.adj,
                        reuse.pANN = FALSE, 
                        sct = FALSE)

# Save the classified doublets
doublets_25C_Rep1 <- dat@meta.data |>
  subset(DF.classifications_0.25_0.3_63 == "Doublet") |>
  rownames()

# 25C_Rep2 ---------------------------------------------------------------------

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/02_dat_10x_filtered.rds")) |>
  subset(sample_name == "25C_Rep2")

# Pre-process Seurat object ----------------------------------------------------
DefaultAssay(dat) <- "RNA"
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 2000)
dat <- ScaleData(dat)
dat <- RunPCA(dat)
dat <- RunUMAP(dat, dims = 1:10)
dat <- FindNeighbors(dat, dims = 1:10)
dat <- FindClusters(dat, resolution = 0.1)

# pK Identification ------------------------------------------------------------
sweep.res.list_dat <- paramSweep_v3(dat, PCs = 1:10, sct = FALSE)
sweep.stats_dat <- summarizeSweep(sweep.res.list_dat, GT = FALSE)
bcmvn_dat <- find.pK(sweep.stats_dat)

# Homotypic Doublet Proportion Estimate ----------------------------------------
homotypic.prop <- modelHomotypic(dat@meta.data$seurat_clusters)
nExp_poi <- round(exp_double_rate * nrow(dat@meta.data))
nExp_poi.adj <- round(nExp_poi*(1 - homotypic.prop))

# Run DoubletFinder with varying classification stringencies -------------------
dat <- doubletFinder_v3(dat, 
                        PCs = 1:10,
                        pK = 0.01,
                        nExp = nExp_poi.adj,
                        reuse.pANN = FALSE, 
                        sct = FALSE)

# Save the classified doublets
doublets_25C_Rep2 <- dat@meta.data |>
  subset(DF.classifications_0.25_0.01_72 == "Doublet") |>
  rownames()

doublets <- c(doublets_18C_Rep1,
              doublets_18C_Rep2,
              doublets_25C_Rep1,
              doublets_25C_Rep2)

# Save doublets
saveRDS(doublets, here::here("data/processed/qc/doublet_finder.rds"))
05_rm_multiplets.R
Code
# ------------------------------------------------------------------------------
# Remove multiplets
# May 23, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
require(tidyverse)
require(Seurat)

# Load multiplet barcodes
doublet_amulet <- readRDS(
  here::here("data/processed/qc/doublet_amulet.rds")
  ) |>
  filter(q.value < 0.05) |>
  rownames()
doublet_finder <- readRDS(here::here("data/processed/qc/doublet_finder.rds"))

# Load seurat object
dat <- readRDS(
  here::here("data/processed/seurat_object/02_dat_initial_cluster.rds")
) 

# Mark amulet described multiplet barcodes
dat$doublet_amulet <- ifelse(colnames(dat) %in% doublet_amulet, 
                             "Multiplet",
                             "Singlet")

# Mark amulet described multiplet barcodes
dat$doublet_finder <- ifelse(colnames(dat) %in% doublet_finder,
                             "Multiplet",
                             "Singlet")

# Save dat with multiplets marked
saveRDS(dat, here::here("data/processed/seurat_object/05_dat_multiplets.rds"))

# Remove all multiplets from the data set
dat <- dat |>
  subset(doublet_amulet == "Singlet" & doublet_finder == "Singlet")

# There are still two outlier barcodes with exceptionally high nCount_RNA when
# compared to the remaining high quality nuclei – 12-13k compared to 6-7k for 
# the next highest cells -- so let's remove those two outliers now
dat <- dat |> 
  subset(nCount_RNA < 10000)

# Remove cluster info from that initial clustering
dat@meta.data <- dat@meta.data |>
  select(-seurat_clusters) |>
  select(!contains("snn"))

# Save dat with multiplets removed
saveRDS(dat, here::here("data/processed/seurat_object/05_dat_filtered.rds"))
06_qc_data_cells.R
Code
# ------------------------------------------------------------------------------
# Quality control and filtering of low-quality cells
# May 25, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Description -----
# Quality control metrics of final cells

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)
library(AnnotationHub)

# Load 10X filtered data
dat <- readRDS(
  here::here("data/processed/seurat_object/05_dat_filtered.rds")
)

# Load path to fragment file for later counting
data_dir <- here::here("data/processed/seq/all/outs/")
frag_path <- paste0(data_dir, "atac_fragments.tsv.gz")

# Get gene annotations for dm6 -------------------------------------------------
BDGP6.32 <- query(AnnotationHub(), 
                  c("EnsDb", "Drosophila melanogaster", "109"))[[1]]
annotation <- GetGRangesFromEnsDb(BDGP6.32)
seqlevelsStyle(annotation) <- 'UCSC'

# Calculate additional QC metrics ----------------------------------------------

# Calculate TSS Enrichment matrix for the cells that are left
DefaultAssay(dat) <- "ATAC"
dat <- TSSEnrichment(dat, fast = FALSE)

# Counting fraction of reads in peaks -----

# Count fragments
total_fragments <- CountFragments(
  fragments = frag_path, 
  cells = Cells(dat)
)

# Add to metadata
dat$fragments <- total_fragments[total_fragments$CB == colnames(dat), 
                                 "frequency_count"]

# Peak calling -----------------------------------------------------------------
# Installed macs2 using the R package Herper

# Path to miniconda
path_to_miniconda <-
  "/slipstream_old/home/thomasoleary/.local/share/r-miniconda" 
# Path to macs2 for CallPeaks function
macs2_path <- paste0(path_to_miniconda, 
                     "/envs/PeakCalling_analysis/bin/macs2")

# Call peaks
peaks <- CallPeaks(
  dat, 
  macs2.path = macs2_path,
  effective.genome.size = 1.25e8
)

# Create the peak matrix
peak_matrix <- FeatureMatrix(
  fragments = Fragments(dat),
  features = peaks
)

# Create a new assay using the MACS2 peak set and add it to the Seurat object
dat[["peaks"]] <- CreateChromatinAssay(
  counts = peak_matrix,
  fragments = frag_path,
  annotation = annotation
)

# Calculate fraction of reads in peaks
dat <- FRiP(
  dat,
  assay = "peaks", 
  total.fragments = "fragments"
)

# Fraction of reads in TSS region ----------------------------------------------

# Get the TSS sites
TSS_sites <- GetTSSPositions(
  dat@assays$ATAC@annotation
)

# # Get the 2000 bp upstream and 200 bp downstream region with promoter
# # This is how TSS was defined in Calderon et al. 2022
promoter_region <- promoters(TSS_sites)

dat$nCount_TSS <- CountsInRegion(
  dat, 
  assay = "ATAC", 
  regions = TSS_sites
)

dat$nCount_promoter <- CountsInRegion(
  dat, 
  assay = "ATAC", 
  regions = promoter_region
)

# Calculate
dat$FRiT <- dat$nCount_TSS/dat$fragments

# Save seurat object with added info
saveRDS(dat, here::here("data/processed/seurat_object/06_dat_qc.rds"))
07_cluster.R
Code
# ------------------------------------------------------------------------------
# Dimension reduction and clustering
# May 30, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)
library(clustree)

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/06_dat_qc.rds")
)

# Joint Clustering based on Signac tutorial ------------------------------------
# https://stuartlab.org/signac/articles/pbmc_multiomic.html

# RNA data processing
DefaultAssay(dat) <- "RNA"
dat <- SCTransform(dat)
dat <- RunPCA(dat)

# ATAC data processing
DefaultAssay(dat) <- "peaks"
dat <- FindTopFeatures(dat, min.cutoff = 5)
dat <- RunTFIDF(dat)
dat <- RunSVD(dat)

# The first lsi component often captures sequencing depth rather than biological 
# variation so we should not use this first component in further analysis.
# See plot below for ~-1 corr with depth and the first component
DepthCor(dat)

# Build a joint neighbor graph using both assays
dat <- FindMultiModalNeighbors(
  object = dat,
  reduction.list = list("pca", "lsi"), 
  dims.list = list(1:50, 2:40),
  verbose = TRUE
)

# Build a joint UMAP
dat <- RunUMAP(
  object = dat,
  nn.name = "weighted.nn",
  assay = "RNA",
  verbose = TRUE
)

# Build a joint t-SNE
dat <- RunTSNE(
  object = dat,
  nn.name = "weighted.nn",
  assay = "RNA",
  verbose = TRUE
)

# Find Clusters based on the LSI
dat <- FindClusters(dat, 
                    resolution = 0.2,
                    graph.name = "wknn", 
                    algorithm = 3)

# Save data
saveRDS(dat, here::here("data/processed/seurat_object/07_dat_cluster.rds"))

################################################################################
# To see how and if the dimensional reduction of only ATAC or RNA libraries 
# looks different -- save these data below

# RNA-only dim-reduction -------------------------------------------------------
# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/07_dat_cluster.rds")
)

# RNA data processing
DefaultAssay(dat) <- "RNA"
dat <- SCTransform(dat)
dat <- RunUMAP(RunPCA(dat), dims = c(1:50))

# Save dat with RNA only UMAP
saveRDS(dat,
  here::here("data/processed/seurat_object/07_dat_rna_umap.rds")
)

# ATAC-only dim-reduction ------------------------------------------------------
# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/07_dat_cluster.rds")
)
 
# ATAC UMAP quick
DefaultAssay(dat) <- "peaks"
dat <- dat |> 
  ScaleData() |>
  RunPCA() |>
  RunUMAP(dims = c(1:50))

# Save dat with ATAC only UMAP
saveRDS(dat,
  here::here("data/processed/seurat_object/07_dat_atac_umap.rds")
)
08_cluster_markers.R
Code
# ------------------------------------------------------------------------------
# Differential expression and accessibility
# June 13, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)

# Load data
dat <- readRDS(here::here("data/processed/seurat_object/07_dat_cluster.rds"))

# Find markers of each cluster that are conserved between the two conditions ---

# Create empty list to populate in the for loop
markers <- list()

# Loop through each cluster individually
for (i in levels(dat)) {
  markers[[i]] <- FindConservedMarkers(dat, 
                                       ident.1 = i,
                                       grouping.var = "acc_temp",
                                       only.pos = TRUE) |> 
    rownames_to_column("gene")
}

# Combine markers for all clusters
markers <- bind_rows(markers, 
                     .id = "cluster")

# Number of markers per cluster
markers %>%
  filter(max_pval < 0.05) |> 
  group_by(cluster) |> 
  tally()

# Save markers object
saveRDS(markers, here::here("data/processed/genes/markers.rds"))
09_cluster_annotation.R
Code
# ------------------------------------------------------------------------------
# Annotating Seurat Clusters using Fisher's enrichment of BDGP in situ data
# June 12, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)

# Load data --------------------------------------------------------------------

# Berkeley Drosophila Genome Project in situ database
# https://insitu.fruitfly.org/insitu-mysql-dump/insitu_annot.csv.gz
insitu_annot <- read_csv(here::here("data/raw/annot/insitu_annot.csv"))

# Cluster annotations from Calderon data at 4 to 6 hours 
cluster_annot_calderon <- read_csv(
  here::here("data/raw/annot/calderon_cluster_annot.csv")) |>
  filter(`time window (modeled)` == "4-6") |>
  dplyr::rename(cluster = `cluster #`) |>
  dplyr::select(cluster, `selected annotation (manual)`) |> 
  distinct(`selected annotation (manual)`)

# Marker genes from the Seurat clusters -- all markers with pval < 0.05
markers <- readRDS(here::here("data/processed/genes/markers.rds")) |> 
  group_by(cluster) |> 
  filter(max_pval < 0.05)


# Fisher's Exact Test for enrichment of cell-type specific marker genes --------

# Reference panel of a specific level of the in situ data
ref_panel <- insitu_annot
cluster_annot <- NULL

pb <- progress::progress_bar$new(
  total = length(unique(markers$cluster)) * length(unique(ref_panel$annot)))
# Loop through each cluster present in the query markers data
for (cluster_i in unique(markers$cluster)) {
  # Unique markers for a cluster
  query_markers <- markers |> 
    filter(cluster == cluster_i) |> 
    ungroup() |> 
    distinct(gene) |> 
    deframe()
  
  # Loop through each annotation type in the reference
  for (ref_annot_j in unique(ref_panel$annot)) {
    
    # Distinct reference markers for this annotation
    ref_markers <- ref_panel |> 
      filter(annot == ref_annot_j) |> 
      distinct(gene) |> 
      deframe()
    
    # Collect numbers for the Fisher's exact test contingency table ------------
    
    # Number of markers in common
    n_common <- length(intersect(query_markers, ref_markers))
    # Number unique to the query
    n_query_unique <- length(setdiff(query_markers, ref_markers))
    # Number unique to the reference
    n_ref_unique <- length(setdiff(ref_markers, query_markers))     
    # Number remaining
    n_remain <- length(unique(c(unique(markers$gene), rownames(ref_panel)))) - 
      n_common - n_query_unique - n_ref_unique
    
    # Proportion of query markers that are shared with the reference
    prop_overlap <- n_common/length(query_markers)
    
    # Contingency table
    con_table <- matrix(
      c(n_common, 
        n_query_unique, 
        n_ref_unique, 
        n_remain), 
      ncol = 2)
    
    # Run the Fisher's exact test and save the p-value
    fet_pval <- fisher.test(con_table)$p.value
    
    # Store results
    df <- tibble(
      cluster = cluster_i,
      annot = ref_annot_j,
      prop_overlap = prop_overlap,
      pval = fet_pval
    )
    
    # Bind results together into a dataframe
    cluster_annot <- bind_rows(df, cluster_annot)
    
    # Print progress bar
    pb$tick()
  }
}

pb$terminate()

# Adjusted pval for FDR correction
cluster_annot <- cluster_annot %>%
  mutate(padj = p.adjust(pval, method = "BH"))

# Save the full annotation results
saveRDS(cluster_annot, here::here("data/processed/annot/cluster_annot_all.rds"))

# Manual annotations by TSO using the 11 annotations present in at 4 to 6 hours
# of development in the Calderon data
cluster_annot_manual <- c(
  "0" = "ectoderm primordium",
  "1" = "mesoderm primordium",
  "2" = "ventral nerve cord primordium",
  "3" = "endoderm primordium",
  "4" = "ectoderm primordium",
  "5" = "muscle system primordium",
  "6" = "tracheal primordium",
  "7" = "ventral nerve cord primordium",
  "8" = "plasmatocytes anlage",
  "9" = "amnioserosa",
  "10" = "unknown"
)

# Add in the consensus results
cluster_annot_manual <- cluster_annot |> 
  filter(padj < 0.05) |> 
  mutate(cluster = as.numeric(cluster)) |> 
  group_by(cluster) |> 
  slice_min(padj, n = 3) |> 
  summarise(top_3 = paste0(annot, collapse = "; ")) |> 
  mutate(annot_manual = cluster_annot_manual) |> 
  select(cluster, annot_manual, top_3)

# Save the consensus results
saveRDS(
  cluster_annot_manual, 
  here::here("data/processed/annot/cluster_annot_manual.rds")
)
10_link_peaks.R
Code
# ------------------------------------------------------------------------------
# Link peaks to genes
# June 06, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
library(tidyverse)
library(Seurat)
library(Signac)
library(BSgenome.Dmelanogaster.UCSC.dm6)

# Load data
dat <- readRDS(here::here("data/processed/seurat_object/07_dat_cluster.rds"))
seqlevelsStyle(BSgenome.Dmelanogaster.UCSC.dm6) <- "NCBI"

# Set assay
DefaultAssay(dat) <- "peaks"

# Compute the GC content for each peak
dat <- RegionStats(
  dat, 
  genome = BSgenome.Dmelanogaster.UCSC.dm6
)

# Link peaks to genes
dat <- LinkPeaks(
  object = dat,
  peak.assay = "peaks",
  expression.assay = "SCT"
)

# Save data
saveRDS(dat, here::here("data/processed/seurat_object/10_dat_linked.rds"))
11_pseudobulk_DESeq2.R
Code
# ------------------------------------------------------------------------------
# Pseudobulk analysis
# June 08, 2023
# TS O'Leary
# ------------------------------------------------------------------------------

# Load libraries
require(tidyverse)
require(Seurat)
require(Signac)
require(DESeq2)

# Load data
dat <- readRDS(
  here::here("data/processed/seurat_object/06_dat_qc.rds")
)

# Aggregate samples data into counts matrix
counts <- AggregateExpression(dat, 
                              group.by = "sample_name",
                              assays = "RNA",
                              slot = "counts",
                              return.seurat = FALSE)[[1]]

# Create metadata for counts matrix
metadata <- tibble(
  samples = c("18C_Rep1","18C_Rep2","25C_Rep1","25C_Rep1"),
  acc_temp = as.factor(c("18C", "18C", "25C", "25C"))
)

# Create DESeq Data Set
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = metadata,
                              design = ~ acc_temp)

# Filter out genes with low counts across the four samples
keep_genes <- rowSums(counts(dds)) >= 10
dds <- dds[keep_genes,]

# Run DESeq2
dds <- DESeq(dds)

# Generate results object
res <- results(dds, name = "acc_temp_25C_vs_18C")
deg_res <- res |>
  as_tibble(rownames = "gene")
degs <- res |>
  as_tibble(rownames = "gene") |>
  dplyr::filter(padj< 0.05) |>
  arrange(padj)

# Save results
saveRDS(res, here::here("output/degs/pseudobulk_DESeq_res.rds"))
saveRDS(degs, here::here("output/degs/pseudobulk_DESeq.rds"))
write_csv(degs |> select(gene), here::here("output/degs/pseudobulk_DESeq_gene_names.csv"))
Note

The code used for analysis is located on GitHub

1 Quality control

Details on calling high-quality nuceli barcodes
  1. We used Cell Ranger ARC called cell barcodes – algorithm described here. 14,301 barcodes out of 2,215,183.

  2. Because the Cell Ranger ARC cell calling algorithm is very permissive to barcodes with very low counts (i.e., a minimum of a single count in each library), barcodes were additionally filtered to a low count threshold in both the ATAC and RNA libraries based on the clearly defined population of cells in the RNA and ATAC count scatterplot. Additionally, barcodes with more than 5% of RNA reads mapping to genes on the mitochondrial genome were excluded. A total of 886 barcodes were filtered out for 13,145 left.

  3. Multiplets were filtered out using two independent methods, relying on either the ATAC or RNA libraries to call multiplets. AMULET relies on the assumption that in snATAC-seq of diploids there should be at most two overlapping fragments with the same cell barcode. The presence of more than two overlapping fragments is a potential indication of a multiplet. Doublets were also identified using the RNA-seq libraries with DoubletFinder. After removing the 1,103 multiplet barcodes, there were still two barcodes that had substantially higher number of RNA reads after all filtering (i.e., n_Count_RNA around 12-13k when the next highest are around 6-7k). So I removed those two for a final number of 12,040 high quality nuceli barcodes.

1.1 Knee plot

1.2 Scatter plot of counts

1.3 Violin QC plots

1.4 Multiplets

1.4.1 Count histograms

1.5 QC metrics per sample

1.6 QC metrics on UMAP

2 Seurat

2.1 Clusters

Cells per cluster

2.1.1 Acclimation clusters

2.2 Annotation

2.2.1 Annotation results

cluster annot prop_overlap pval padj
0 ventral epidermis primordium 0.0532787 0.0000000 0.0000000
0 dorsal epidermis primordium 0.0532787 0.0000000 0.0000000
0 dorsal ectoderm primordium 0.0368852 0.0000002 0.0000159
0 embryonic dorsal epidermis 0.0491803 0.0000007 0.0000484
0 tracheal primordium 0.0245902 0.0000098 0.0004196
0 embryonic ventral epidermis 0.0409836 0.0000121 0.0005023
0 dorsal epidermis anlage 0.0122951 0.0000838 0.0026510
0 ventral ectoderm anlage in statu nascendi 0.0204918 0.0003854 0.0098415
0 subset 0.0245902 0.0005109 0.0121885
0 foregut primordium 0.0245902 0.0005463 0.0126547
0 clypeo-labral primordium 0.0163934 0.0006188 0.0141685
0 hindgut proper primordium 0.0286885 0.0008120 0.0179733
0 ventral ectoderm primordium 0.0204918 0.0010421 0.0218520
0 embryonic/larval tracheal system 0.0204918 0.0013134 0.0268347
0 embryonic epipharynx 0.0204918 0.0018848 0.0362357
0 ventral epidermis primordium P2 0.0081967 0.0020118 0.0379853
0 atrium 0.0163934 0.0026519 0.0465421
1 trunk mesoderm primordium 0.0585106 0.0000053 0.0002496
1 muscle system primordium 0.0265957 0.0001090 0.0033143
1 somatic muscle primordium 0.0265957 0.0004709 0.0115811
1 longitudinal visceral mesoderm primordium 0.0159574 0.0013004 0.0267054
2 ventral nerve cord primordium 0.0388350 0.0000317 0.0011599
2 brain primordium 0.0388350 0.0000503 0.0016699
2 ventral nerve cord primordium P3 0.0258900 0.0002285 0.0063667
2 ventral ectoderm primordium P2 0.0226537 0.0002856 0.0076374
2 no staining 0.0194175 0.0014403 0.0289806
2 embryonic brain 0.0388350 0.0020681 0.0388641
2 central brain primordium P3 0.0129450 0.0022350 0.0412232
3 posterior midgut primordium 0.0996785 0.0000000 0.0000000
3 anterior midgut primordium 0.0932476 0.0000000 0.0000000
3 posterior endoderm primordium 0.0707395 0.0000000 0.0000000
3 anterior endoderm primordium 0.0675241 0.0000000 0.0000000
3 embryonic midgut 0.0868167 0.0000000 0.0000000
3 embryonic/larval muscle system 0.0546624 0.0000000 0.0000000
3 trunk mesoderm primordium 0.0610932 0.0000000 0.0000002
3 posterior endoderm primordium P2 0.0450161 0.0000000 0.0000005
3 somatic muscle primordium 0.0353698 0.0000000 0.0000013
3 embryonic/larval visceral muscle 0.0385852 0.0000000 0.0000033
3 dorsal prothoracic pharyngeal muscle 0.0385852 0.0000002 0.0000130
3 dorsal pharyngeal muscle primordium 0.0257235 0.0000004 0.0000302
3 visceral muscle primordium 0.0289389 0.0000010 0.0000640
3 anterior endoderm anlage 0.0353698 0.0000011 0.0000647
3 embryonic/larval somatic muscle 0.0321543 0.0000011 0.0000650
3 muscle system primordium 0.0257235 0.0000012 0.0000718
3 embryonic anal pad 0.0289389 0.0000033 0.0001646
3 salivary gland primordium 0.0192926 0.0000039 0.0001888
3 head mesoderm primordium P2 0.0353698 0.0000113 0.0004780
3 head mesoderm primordium 0.0257235 0.0000225 0.0008548
3 cellular blastoderm 0.0385852 0.0000415 0.0014407
3 embryonic hindgut 0.0385852 0.0000448 0.0015240
3 maternal 0.1028939 0.0000890 0.0027922
3 salivary gland body primordium 0.0192926 0.0002808 0.0076106
3 longitudinal visceral mesoderm primordium 0.0128617 0.0004456 0.0110955
3 embryonic/larval fat body 0.0192926 0.0005423 0.0126349
3 foregut primordium 0.0192926 0.0018918 0.0362357
3 trunk mesoderm anlage 0.0064309 0.0019156 0.0365160
3 head mesoderm primordium P4 0.0225080 0.0021003 0.0392842
3 ubiquitous 0.0643087 0.0021180 0.0394311
4 ventral epidermis primordium 0.0437158 0.0000002 0.0000168
4 dorsal ectoderm primordium 0.0382514 0.0000036 0.0001738
4 dorsal epidermis primordium 0.0382514 0.0000095 0.0004122
4 tracheal primordium 0.0273224 0.0000335 0.0012120
4 embryonic ventral epidermis 0.0437158 0.0000569 0.0018588
4 embryonic dorsal epidermis 0.0437158 0.0001155 0.0034612
4 embryonic/larval tracheal system 0.0273224 0.0003611 0.0093414
4 subset 0.0273224 0.0009376 0.0204129
4 hindgut proper primordium 0.0327869 0.0009616 0.0205963
4 ventral epidermis primordium P2 0.0109290 0.0011416 0.0236881
4 embryonic foregut 0.0273224 0.0016159 0.0318710
5 muscle system primordium 0.0307692 0.0003132 0.0082638
5 visceral muscle primordium 0.0307692 0.0009039 0.0197870
5 longitudinal visceral muscle fibers 0.0153846 0.0015871 0.0314581
6 tracheal primordium 0.0508475 0.0000000 0.0000000
6 embryonic/larval tracheal system 0.0451977 0.0000001 0.0000129
6 dorsal epidermis primordium 0.0451977 0.0000006 0.0000439
6 dorsal ectoderm primordium 0.0395480 0.0000029 0.0001485
6 embryonic ventral epidermis 0.0508475 0.0000059 0.0002705
6 embryonic dorsal epidermis 0.0508475 0.0000133 0.0005317
6 ventral epidermis primordium 0.0338983 0.0000303 0.0011268
6 primary segmental branch specific anlage 0.0112994 0.0001209 0.0035668
6 embryonic salivary gland duct 0.0169492 0.0001680 0.0048494
6 visceral branch specific anlage 0.0112994 0.0001842 0.0052796
6 dorsal trunk specific anlage 0.0112994 0.0005672 0.0130612
6 dorsal epidermis anlage 0.0112994 0.0017218 0.0334610
6 embryonic salivary gland common duct 0.0112994 0.0019356 0.0367202
6 embryonic head epidermis 0.0282486 0.0023524 0.0425996
7 ventral nerve cord primordium P3 0.0448980 0.0000001 0.0000074
7 brain primordium 0.0571429 0.0000001 0.0000117
7 ventral nerve cord primordium 0.0530612 0.0000005 0.0000369
7 embryonic brain 0.0653061 0.0000008 0.0000506
7 procephalic ectoderm primordium 0.0367347 0.0000020 0.0001084
7 procephalic ectoderm anlage 0.0326531 0.0000030 0.0001555
7 dorsal ectoderm anlage in statu nascendi 0.0285714 0.0000067 0.0003047
7 ventral nerve cord anlage 0.0244898 0.0000131 0.0005264
7 ventral ectoderm primordium 0.0285714 0.0000131 0.0005264
7 procephalic ectoderm anlage in statu nascendi 0.0244898 0.0000395 0.0013943
7 ventral nerve cord 0.0489796 0.0002790 0.0076106
7 ventral ectoderm anlage in statu nascendi 0.0204082 0.0003926 0.0099626
7 head mesoderm anlage in statu nascendi 0.0122449 0.0004129 0.0104104
7 amnioserosa anlage in statu nascendi 0.0122449 0.0004578 0.0113295
7 ectoderm anlage in statu nascendi 0.0122449 0.0004815 0.0117678
7 ubiquitous 0.0734694 0.0004957 0.0119689
7 subset 0.0244898 0.0005220 0.0123785
7 cellular blastoderm 0.0367347 0.0005275 0.0123939
7 central brain primordium P3 0.0163265 0.0009581 0.0205963
7 head epidermis dorsal anlage in statu nascendi 0.0081633 0.0011976 0.0247220
7 anlage in statu nascendi 0.0163265 0.0016566 0.0325115
7 anterior endoderm anlage in statu nascendi 0.0122449 0.0024786 0.0441834
7 yolk nuclei 0.0204082 0.0024842 0.0441834
7 maternal 0.0938776 0.0025898 0.0458062
8 plasmatocytes anlage 0.0365854 0.0000000 0.0000000
8 head mesoderm primordium P2 0.0560976 0.0000000 0.0000000
8 yolk nuclei 0.0439024 0.0000000 0.0000000
8 amnioserosa 0.0414634 0.0000000 0.0000000
8 plasmatocytes 0.0219512 0.0000000 0.0000001
8 garland cell primordium 0.0219512 0.0000000 0.0000001
8 head mesoderm primordium P4 0.0365854 0.0000000 0.0000019
8 embryonic salivary gland 0.0243902 0.0000004 0.0000286
8 posterior endoderm primordium P2 0.0317073 0.0000009 0.0000579
8 embryonic/larval garland cell 0.0195122 0.0000012 0.0000720
8 maternal 0.1073171 0.0000013 0.0000752
8 salivary gland body primordium 0.0219512 0.0000031 0.0001562
8 ubiquitous 0.0756098 0.0000034 0.0001658
8 crystal cell 0.0146341 0.0000082 0.0003637
8 embryonic/larval fat body 0.0219512 0.0000085 0.0003702
8 anterior endoderm anlage 0.0268293 0.0000145 0.0005656
8 posterior endoderm primordium 0.0317073 0.0000236 0.0008887
8 procrystal cell 0.0121951 0.0000430 0.0014782
8 anterior endoderm primordium 0.0292683 0.0000697 0.0022403
8 embryonic midgut 0.0487805 0.0000829 0.0026429
8 head mesoderm anlage in statu nascendi 0.0097561 0.0001014 0.0031303
8 cellular blastoderm 0.0317073 0.0001462 0.0042511
8 posterior midgut primordium 0.0390244 0.0002203 0.0062258
8 salivary gland primordium 0.0121951 0.0002226 0.0062441
8 trunk mesoderm primordium 0.0317073 0.0004389 0.0109985
8 embryonic dorsal epidermis 0.0268293 0.0004901 0.0119068
8 apically cleared 0.0097561 0.0008015 0.0178385
8 mesoderm anlage in statu nascendi 0.0097561 0.0010420 0.0218520
8 anterior endoderm anlage in statu nascendi 0.0097561 0.0010801 0.0225286
8 dorsal epidermis primordium 0.0170732 0.0013450 0.0272322
8 anterior midgut primordium 0.0341463 0.0013466 0.0272322
8 trunk mesoderm primordium P2 0.0219512 0.0015033 0.0300668
8 posterior endoderm anlage in statu nascendi 0.0097561 0.0017201 0.0334610
8 embryonic/larval tracheal system 0.0146341 0.0024406 0.0439969
8 ventral epidermis primordium 0.0146341 0.0025984 0.0458062
9 amnioserosa 0.0416667 0.0000000 0.0000000
9 yolk nuclei 0.0321970 0.0000000 0.0000000
9 embryonic salivary gland 0.0284091 0.0000000 0.0000000
9 embryonic dorsal epidermis 0.0416667 0.0000000 0.0000001
9 embryonic ventral epidermis 0.0378788 0.0000000 0.0000003
9 salivary gland body primordium 0.0246212 0.0000000 0.0000006
9 maternal 0.1098485 0.0000000 0.0000012
9 dorsal ectoderm primordium 0.0227273 0.0000004 0.0000275
9 embryonic epipharynx 0.0227273 0.0000005 0.0000398
9 embryonic midgut 0.0530303 0.0000007 0.0000484
9 embryonic hypopharynx 0.0227273 0.0000009 0.0000597
9 embryonic proventriculus 0.0227273 0.0000013 0.0000746
9 embryonic/larval tracheal system 0.0208333 0.0000017 0.0000917
9 dorsal epidermis primordium 0.0227273 0.0000017 0.0000917
9 subset 0.0227273 0.0000021 0.0001106
9 head mesoderm primordium P2 0.0284091 0.0000045 0.0002118
9 posterior midgut primordium 0.0416667 0.0000059 0.0002705
9 salivary gland primordium 0.0132576 0.0000071 0.0003163
9 ventral epidermis primordium 0.0189394 0.0000128 0.0005244
9 posterior endoderm primordium P2 0.0246212 0.0000136 0.0005350
9 dorsal ectoderm anlage in statu nascendi 0.0170455 0.0000212 0.0008104
9 garland cell primordium 0.0113636 0.0000305 0.0011268
9 anterior midgut primordium 0.0378788 0.0000353 0.0012685
9 ventral ectoderm primordium 0.0170455 0.0000470 0.0015869
9 embryonic hindgut 0.0303030 0.0000495 0.0016569
9 anterior endoderm primordium 0.0265152 0.0000520 0.0017106
9 apoptotic amnioserosa 0.0075758 0.0000578 0.0018711
9 ubiquitous 0.0625000 0.0000939 0.0029234
9 embryonic proventriculus outer layer 0.0075758 0.0001035 0.0031718
9 embryonic head epidermis 0.0208333 0.0001128 0.0034036
9 hindgut proper primordium 0.0227273 0.0001184 0.0035205
9 anterior endoderm anlage 0.0208333 0.0001389 0.0040680
9 embryonic foregut 0.0189394 0.0001873 0.0053299
9 atrium 0.0132576 0.0002623 0.0072568
9 head mesoderm anlage in statu nascendi 0.0075758 0.0002658 0.0073031
9 posterior endoderm primordium 0.0246212 0.0002910 0.0077299
9 ventral ectoderm primordium P2 0.0170455 0.0003336 0.0086855
9 foregut primordium 0.0170455 0.0003729 0.0095849
9 embryonic/larval garland cell 0.0113636 0.0005089 0.0121885
9 trunk mesoderm anlage in statu nascendi 0.0075758 0.0005289 0.0123939
9 procephalic ectoderm anlage 0.0151515 0.0006406 0.0145830
9 tracheal primordium 0.0113636 0.0006581 0.0148964
9 procephalon primordium 0.0056818 0.0007305 0.0164421
9 embryonic salivary gland body 0.0094697 0.0007527 0.0168459
9 head mesoderm primordium P4 0.0189394 0.0009603 0.0205963
9 plasmatocytes anlage 0.0094697 0.0009879 0.0210473
9 no staining 0.0946970 0.0010414 0.0218520
9 trunk mesoderm primordium 0.0265152 0.0015094 0.0300668
9 gnathal primordium 0.0056818 0.0017378 0.0336094
9 ventral ectoderm anlage in statu nascendi 0.0113636 0.0021914 0.0406077
9 clypeolabrum primordium P2 0.0056818 0.0022465 0.0412449
9 trunk mesoderm primordium P2 0.0189394 0.0024784 0.0441834
10 maternal 0.2123711 0.0000000 0.0000000
10 ubiquitous 0.1567010 0.0000000 0.0000000
10 trunk mesoderm primordium 0.0597938 0.0000000 0.0000000
10 embryonic midgut 0.0762887 0.0000000 0.0000000
10 faint ubiquitous 0.0701031 0.0000000 0.0000000
10 embryonic/larval muscle system 0.0474227 0.0000000 0.0000000
10 anterior midgut primordium 0.0577320 0.0000000 0.0000000
10 posterior midgut primordium 0.0577320 0.0000000 0.0000001
10 cellular blastoderm 0.0474227 0.0000000 0.0000001
10 head mesoderm primordium 0.0309278 0.0000000 0.0000001
10 trunk mesoderm primordium P2 0.0371134 0.0000000 0.0000005
10 anterior endoderm primordium 0.0391753 0.0000000 0.0000008
10 posterior endoderm primordium 0.0371134 0.0000001 0.0000067
10 anterior endoderm anlage 0.0309278 0.0000001 0.0000067
10 dorsal prothoracic pharyngeal muscle 0.0309278 0.0000001 0.0000074
10 somatic muscle primordium 0.0247423 0.0000001 0.0000117
10 posterior endoderm primordium P2 0.0309278 0.0000002 0.0000148
10 embryonic/larval visceral muscle 0.0268041 0.0000006 0.0000409
10 head mesoderm primordium P4 0.0288660 0.0000009 0.0000597
10 embryonic/larval somatic muscle 0.0247423 0.0000014 0.0000811
10 head mesoderm primordium P2 0.0309278 0.0000016 0.0000885
10 salivary gland body primordium 0.0185567 0.0000119 0.0004975
10 strong ubiquitous 0.0164948 0.0000209 0.0008080
10 embryonic brain 0.0412371 0.0000369 0.0013135
10 fat body specific anlage 0.0082474 0.0000416 0.0014407
10 brain primordium 0.0288660 0.0002837 0.0076357
10 ventral nerve cord 0.0371134 0.0003220 0.0084401
10 hindgut proper primordium 0.0206186 0.0009010 0.0197870
10 longitudinal visceral mesoderm primordium 0.0082474 0.0022860 0.0417766
10 embryonic hindgut 0.0247423 0.0023353 0.0424833

2.2.2 Manual annotations

cluster annot_manual top_3
0 ectoderm primordium ventral epidermis primordium; dorsal epidermis primordium; dorsal ectoderm primordium
1 mesoderm primordium trunk mesoderm primordium; muscle system primordium; somatic muscle primordium
2 ventral nerve cord primordium ventral nerve cord primordium; brain primordium; ventral nerve cord primordium P3
3 endoderm primordium posterior midgut primordium; anterior midgut primordium; posterior endoderm primordium
4 ectoderm primordium ventral epidermis primordium; dorsal ectoderm primordium; dorsal epidermis primordium
5 muscle system primordium muscle system primordium; visceral muscle primordium; longitudinal visceral muscle fibers
6 tracheal primordium tracheal primordium; embryonic/larval tracheal system; dorsal epidermis primordium
7 ventral nerve cord primordium ventral nerve cord primordium P3; brain primordium; ventral nerve cord primordium
8 plasmatocytes anlage plasmatocytes anlage; head mesoderm primordium P2; yolk nuclei
9 amnioserosa amnioserosa; yolk nuclei; embryonic salivary gland
10 unknown maternal; ubiquitous; trunk mesoderm primordium
Warning

Cluster 11 is likely also unknown or ubiquitous, as it does not have an annotation that passes FDR.

Session Info

R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] kableExtra_1.3.4 lubridate_1.9.2  forcats_1.0.0    stringr_1.5.0   
 [5] dplyr_1.1.2      purrr_1.0.1      readr_2.1.4      tidyr_1.3.0     
 [9] tibble_3.2.1     ggplot2_3.4.2    tidyverse_2.0.0 

loaded via a namespace (and not attached):
 [1] compiler_4.2.3    pillar_1.9.0      tools_4.2.3       digest_0.6.31    
 [5] viridisLite_0.4.2 timechange_0.2.0  jsonlite_1.8.5    evaluate_0.21    
 [9] lifecycle_1.0.3   gtable_0.3.3      png_0.1-8         pkgconfig_2.0.3  
[13] rlang_1.1.1       cli_3.6.1         rstudioapi_0.14   yaml_2.3.7       
[17] xfun_0.39         fastmap_1.1.1     xml2_1.3.4        httr_1.4.6       
[21] withr_2.5.0       knitr_1.43        systemfonts_1.0.4 generics_0.1.3   
[25] vctrs_0.6.2       htmlwidgets_1.6.2 hms_1.1.3         rprojroot_2.0.3  
[29] webshot_0.5.4     grid_4.2.3        tidyselect_1.2.0  here_1.0.1       
[33] svglite_2.1.1     glue_1.6.2        R6_2.5.1          fansi_1.0.4      
[37] rmarkdown_2.22    tzdb_0.4.0        magrittr_2.0.3    scales_1.2.1     
[41] htmltools_0.5.5   rvest_1.0.3       colorspace_2.1-0  utf8_1.2.3       
[45] stringi_1.7.12    munsell_0.5.0